library(h5)
library(MASS)
library(dplyr)
library(ggplot2)
library(plotly)
library(Rtsne)
library(pheatmap)
library(data.table)
library(knitr)
library(RColorBrewer)
source("../r_functions/PublicationTheme_ggplot.R")
source("../r_functions/feature_class.R")
source("../r_functions/Color_Palette.R")
source("../r_functions/Replicates.R")
knitr::opts_chunk$set(cache = TRUE, warning = FALSE,
message = FALSE, cache.lazy = FALSE)
htmldir = "/collab-ag-fischer/PROMISE/data-10x-4t-c-16z/htmldir"
species = "human"
feature_type = "organoids"
lines = sort(get_lines(species))
colorScale = setNames(
object = get_color_palette(species, length(lines)),
nm = lines)
I reduce the features and select the most informative features using a PCA, i.e. I retain the features that contribute the most to the variance as measured by their relative contribution to each PC component.
dmso_dir = sprintf("%s/DMSO/results", species)
f = h5file(file.path(dmso_dir, sprintf("ReducedFeatures_%s_DMSO.h5", species)), "r")
features = data.frame(t(f[sprintf("features_%s", feature_type)][]))
colnames(features) = f[sprintf("feature_names_%s", feature_type)][]
metadata = data.frame(f[sprintf("metadata_%s", feature_type)][], stringsAsFactors = FALSE)
colnames(metadata) = f[sprintf("metadata_names_%s", feature_type)][]
h5close(f)
The features are sampled to ensure a balance between classes.
sample_size = min(table(metadata$Line))
features_organoids = features %>%
group_by("Line" = metadata$Line) %>%
sample_n(sample_size) %>% as.data.frame
I want to test several feature set sizes to compare them visually
expl_var_plots = list()
lda_proj_plots = list()
num_features_list = c(5, 10, 25, 50, 100, 200)
num_features_list = num_features_list[num_features_list < ncol(features)]
num_features_list = c(num_features_list, ncol(features))
for(num_features in num_features_list) {
model = lda(
x=features_organoids %>% select(-Line) %>%
select(seq_len(num_features)),
grouping=features_organoids$Line)
plda = predict(object = model, newdata = features_organoids %>% select(-Line) %>%
select(seq_len(num_features)))
transformed = as.data.frame(plda$x)
transformed$Line = features_organoids$Line
# Accuracy
acc = mean(as.character(plda$class) == features_organoids$Line)
print(sprintf("Accuracy for %s features = %s", num_features, acc))
# Explained Variance
expl_var_plots[[length(expl_var_plots) + 1]] = ggplot_df = data.frame(
"Component" = paste0("LD", seq_along(model$svd)),
"Value" = model$svd**2 / sum(model$svd**2),
"NumFeatures" = num_features)
# ggplot(data = ggplot_df) +
# geom_col(aes(x = Component, y = Value)) +
# xlab("") + ylab("") + ggtitle("Proportion of Variance") +
# theme_Publication() + scale_fill_Publication() +
# theme(legend.position = "none")
# 2D LDA Projection
transformed$NumFeatures = num_features
lda_proj_plots[[length(lda_proj_plots) + 1]] = transformed[, c("LD1", "LD2", "Line", "NumFeatures")]
# ggplot(data = transformed) +
# geom_point(aes(x = LD1, y = LD2, color = Line)) +
# theme_Publication() + scale_fill_Publication() +
# theme(legend.position = "right", legend.direction = "vertical",
# legend.key.size = unit(0.5, "cm"))
}
## [1] "Accuracy for 5 features = 0.549958574979287"
## [1] "Accuracy for 10 features = 0.705172209728962"
## [1] "Accuracy for 25 features = 0.799100485264528"
## [1] "Accuracy for 50 features = 0.871787588274747"
## [1] "Accuracy for 100 features = 0.924353966938888"
## [1] "Accuracy for 143 features = 0.931297589458319"
lda_proj_plots = do.call(rbind, lda_proj_plots)
expl_var_plots = do.call(rbind, expl_var_plots)
ggplot(data = lda_proj_plots) +
geom_point(aes(x = LD1, y = LD2, color = Line)) +
facet_wrap(facets = ~ NumFeatures) +
theme_Publication() + scale_color_manual(values = colorScale) +
theme(legend.position = "right", legend.direction = "vertical",
legend.key.size = unit(0.5, "cm"))
A better separation becomes visible if features are averaged across wells
well_id = paste0(metadata$Plate, "_", metadata$Well)
features_wells = aggregate(x = features, by = list(well_id), FUN = median)
rownames(features_wells) = features_wells$Group.1
features_wells$Line = substr(features_wells$Group.1, 1, 7)
features_wells$Group.1 = NULL
metadata_wells = aggregate(x = metadata, by = list(well_id), FUN = first)
rownames(metadata_wells) = metadata_wells$Group.1
metadata_wells$Group.1 = NULL
expl_var_plots = list()
lda_proj_plots = list()
contribs = list()
num_features_list = c(5, 10, 25, 50, 100, 200)
num_features_list = num_features_list[num_features_list < ncol(features)]
num_features_list = c(num_features_list, ncol(features))
for(num_features in num_features_list) {
model = lda(
x=features_wells %>% select(-Line) %>%
select(seq_len(num_features)),
grouping=features_wells$Line)
plda = predict(object = model, newdata = features_wells %>% select(-Line) %>%
select(seq_len(num_features)))
transformed = as.data.frame(plda$x)
transformed$Line = features_wells$Line
# Accuracy
acc = mean(as.character(plda$class) == features_wells$Line)
print(sprintf("Accuracy for %s features = %s", num_features, acc))
# Explained Variance
expl_var_plots[[length(expl_var_plots) + 1]] = data.frame(
"Component" = paste0("LD", seq_along(model$svd)),
"Value" = model$svd**2 / sum(model$svd**2),
"NumFeatures" = num_features)
# ggplot(data = ggplot_df) +
# geom_col(aes(x = Component, y = Value)) +
# xlab("") + ylab("") + ggtitle("Proportion of Variance") +
# theme_Publication() + scale_fill_Publication() +
# theme(legend.position = "none")
# 2D LDA Projection
transformed$NumFeatures = num_features
lda_proj_plots[[length(lda_proj_plots) + 1]] = transformed[
, c("LD1", "LD2", "LD3", "Line", "NumFeatures")]
# ggplot(data = transformed) +
# geom_point(aes(x = LD1, y = LD2, color = Line)) +
# theme_Publication() + scale_fill_Publication() +
# theme(legend.position = "right", legend.direction = "vertical",
# legend.key.size = unit(0.5, "cm"))
# Feature contributions to LD1 and LD2
contrib = list()
for(ldc in colnames(model$scaling)) {
contrib[[ldc]] = model$scaling[,ldc]
contrib[[ldc]] = contrib[[ldc]][order(abs(contrib[[ldc]]), decreasing = TRUE)]
contrib[[ldc]] = data.frame(
"var" = names(contrib[[ldc]]),
"value" = contrib[[ldc]],
"LD" = ldc,
"NumFeatures" = num_features)
}
contribs[[length(contribs) + 1]] = do.call(rbind, contrib)
}
## [1] "Accuracy for 5 features = 0.892085076708508"
## [1] "Accuracy for 10 features = 0.976464435146444"
## [1] "Accuracy for 25 features = 0.990585774058577"
## [1] "Accuracy for 50 features = 0.998082287308229"
## [1] "Accuracy for 100 features = 0.998953974895398"
## [1] "Accuracy for 143 features = 0.998953974895398"
lda_proj_plots = do.call(rbind, lda_proj_plots)
expl_var_plots = do.call(rbind, expl_var_plots)
contribs = do.call(rbind, contribs)
ggplot(data = lda_proj_plots) +
geom_point(aes(x = LD1, y = LD2, color = Line)) +
facet_wrap(facets = ~ NumFeatures) +
theme_Publication() + scale_color_manual(values = colorScale) +
theme(legend.position = "right", legend.direction = "vertical",
legend.key.size = unit(0.5, "cm"))
ggplot(data = lda_proj_plots) +
geom_point(aes(x = LD2, y = LD3, color = Line)) +
facet_wrap(facets = ~ NumFeatures) +
theme_Publication() + scale_color_manual(values = colorScale) +
theme(legend.position = "right", legend.direction = "vertical",
legend.key.size = unit(0.5, "cm"))
expl_var_plots = list()
pca_plots = list()
contribs = list()
num_features_list = c(5, 10, 25, 50, 100, 200)
num_features_list = num_features_list[num_features_list < ncol(features)]
num_features_list = c(num_features_list, ncol(features))
for(num_features in num_features_list) {
model = prcomp(
x = features_wells %>% select(-Line) %>%
select(seq_len(num_features)),
scale. = TRUE, center = TRUE)
transformed = as.data.frame(model$x)
transformed$Line = features_wells$Line
# Explained Variance
expl_var_plots[[length(expl_var_plots) + 1]] = data.frame(
"Component" = paste0("PC", seq_along(model$sdev)),
"Value" = model$sdev / sum(model$sdev),
"NumFeatures" = num_features)
# 2D LDA Projection
transformed$NumFeatures = num_features
pca_plots[[length(pca_plots) + 1]] = transformed[, c("PC1", "PC2", "Line", "NumFeatures")]
}
pca_plots = do.call(rbind, pca_plots)
expl_var_plots = do.call(rbind, expl_var_plots)
ggplot(data = pca_plots) +
geom_point(aes(x = PC1, y = PC2, color = Line)) +
facet_wrap(facets = ~ NumFeatures) +
theme_Publication() + scale_color_manual(values = colorScale) +
theme(legend.position = "right", legend.direction = "vertical",
legend.key.size = unit(0.5, "cm"))
tsne = Rtsne(features_wells %>% select(-Line), perplexity = 50)
ggplot_df = data.frame(
"Line" = features_wells$Line,
"X1" = tsne$Y[,1],
"X2" = tsne$Y[,2])
ggplot(data = ggplot_df) + geom_point(aes(x = X1, y = X2, color = Line)) +
theme_Publication() + xlab("X") + ylab("Y") +
theme(legend.position = "right", legend.direction = "vertical",
legend.key.size = unit(0.5, "cm")) +
scale_color_manual(values = colorScale)
I perform a clustering on the lines
# model = lda(
# x=features_wells %>% select(-Line),
# grouping=features_wells$Line)
# plda = predict(
# object = model,
# newdata = features_wells %>% select(-Line))
# transformed = as.data.frame(plda$x)
#
# # Explained Variance
# expl_var_plots = data.frame(
# "Component" = paste0("LD", seq_along(model$svd)),
# "Value" = model$svd**2 / sum(model$svd**2),
# "NumFeatures" = num_features)
model = prcomp(
x = features_wells %>% select(-Line),
scale. = TRUE, center = TRUE)
transformed = as.data.frame(model$x)
transformed$Line = features_wells$Line
features_lines = aggregate(
x = transformed %>% select(-Line),
by = list(features_wells$Line),
FUN = median)
rownames(features_lines) = features_lines$Group.1
features_lines$Group.1 = NULL
d = dist(features_lines)
hc = hclust(d, method = "ward.D2")
pheatmap(
mat = t(as.matrix(features_lines[, 1:5])), cluster_rows = FALSE,
cluster_cols = hc, cutree_cols = 6)
I repeat the clustering but for individual replicates of lines to test the screen quality.
rep_id = paste0(
metadata_wells$Line, "_",
substr(metadata_wells$Plate, 12, 14), "_",
metadata_wells$Replicate)
features_replicates = aggregate(
x = transformed %>% select(-Line),
by = list("Replicate.ID" = rep_id),
FUN = median)
rownames(features_replicates) = features_replicates$Replicate.ID
features_replicates$Replicate.ID = NULL
annotation = data.frame(
"Line" = substr(rownames(features_replicates), 1, 7),
row.names = rownames(features_replicates, 1, 7),
stringsAsFactors = FALSE)
anno_colorScale = list(
"Line" = setNames(
object = colorScale,
nm = sort(unique(annotation$Line))))
d = dist(features_replicates)
hc = hclust(d, method = "ward.D2")
pheatmap(
mat = t(as.matrix(features_replicates[, 1:5])),
cluster_rows = FALSE, cluster_cols = hc, annotation_col = annotation,
annotation_colors = anno_colorScale, show_colnames = FALSE,
border_color = NA)
ph = pheatmap(
mat = d, cluster_rows = hc, cluster_cols = hc,
annotation_col = annotation, # annotation_row = annotation,
annotation_colors = anno_colorScale,
annotation_names_col = FALSE, annotation_names_row = FALSE,
border_color = NA)
set.seed(100)
num_imgs = 3
metadata_wells = aggregate(x = metadata, by = list(well_id), FUN = first)
random_wells = metadata_wells %>% dplyr::group_by(Line) %>% sample_n(num_imgs)
all_imgs = list()
captions = c()
for(line in sort(unique(random_wells$Line))) {
subdat = random_wells[random_wells$Line == line, ]
imgs = list()
plates = c()
for(ii in seq_len(num_imgs)) {
plate = subdat[ii, "Plate"]
plates = c(plates, as.character(plate))
row = substr(subdat[ii, "Well"], 1, 1)
col = substr(subdat[ii, "Well"], 2, 3)
img_url = file.path(htmldir, plate, sprintf("%s_%s_%s_1.jpeg", plate, row, col))
imgs[[ii]] = EBImage::readImage(img_url)
}
all_imgs[[length(all_imgs)+1]] = EBImage::abind(imgs, along = 1)
captions = c(captions, paste0(plates, collapse = ", "))
}
EBImage::display(all_imgs[[1]])
Figure 1: D004T01P004L03, D004T01P004L03, D004T01P007L02
EBImage::display(all_imgs[[2]])
Figure 2: D007T01P003L02, D007T01P005L08, D007T01P005L08
EBImage::display(all_imgs[[3]])
Figure 3: D010T01P022L03, D010T01P018L03, D010T01P021L02
EBImage::display(all_imgs[[4]])